knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gapminder)
library(socviz)
library(ggrepel)

はじめに

glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghan…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia…
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997…
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, …
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134…
p <- ggplot(data = gapminder, 
            mapping = aes(x = log(gdpPercap), y = lifeExp))

# 線形回帰モデルとロバスト線形回帰モデル
p + geom_point(alpha = 0.1) + 
    geom_smooth(color = "tomato", fill = "tomato", method = MASS::rlm) + 
    geom_smooth(color = "steelblue", fill = "steelblue", method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# 多項式回帰モデル
p + geom_point(alpha = 0.1) + 
    geom_smooth(color = "red", fill = "red", method = "lm", size = 1.2,
                formula = y ~ splines::bs(x, df = 3))

# 分位点回帰モデル
p + geom_point(alpha = 0.1) + 
    geom_quantile(color = "red", size = 1.2, method = "rqss",
                  lambda = 1, quantils = c(0.20, 0.5, 0.85))
## Warning: Ignoring unknown parameters: quantils
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)

6.1 複数の回帰直線を凡例付きで一度に図示する

複数の回帰直線をそれぞれ別の設定で重ねることは可能. しかしながら,凡例はデータの内部で結びついているわけではないから.

  1. geom_smooth()関数中のcolorとfill引数に文字列でモデル名をマッピングする
  2. scale_color_manual()関数とscale_fill_manual()関数を組み合わせて判例をつける
# 色の獲得
model_colors <- RColorBrewer::brewer.pal(3, "Set1")
model_colors
## [1] "#E41A1C" "#377EB8" "#4DAF4A"
p0 <- ggplot(data = gapminder,
             mapping = aes(x = log(gdpPercap), y = lifeExp))
p1 <- p0 + 
      geom_point(alpha = 0.2) + 
      geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS")) + 
      geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3),
                  aes(color = "Cubic Spline", fill = "Cubic Spline")) + 
      geom_smooth(method = "loess", aes(color = "loess", fill = "loess"))
p1 + scale_color_manual(name = "Models", values = model_colors) + 
     scale_fill_manual(name = "Models", values = model_colors) + 
     theme(legend.position = "top")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

6.2 モデルオブジェクトの中身を確認する

本書では,データにあてはまるモデルを選択し,ggplotを使って情報を抽出,可視化する方法のみを述べる

Rでは常にオブジェクトを操作している. オブジェクトは名前付きの部分構造(数値やベクトル,formulaなど)を持っているので,簡単にアクセスできる

gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

Rの統計モデルオブジェクトも内部にモデルの結果を格納しているが,複雑である

# 国と調査年の間に構造的な関係がある場合,線形モデルは正しくない?
out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)
# formula記法
# 従属変数 ~ 独立変数

# summary()関数を使うことでモデルの概要がわかる
summary(out)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.161  -4.486   0.297   5.110  25.175 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.781e+01  3.395e-01 140.819  < 2e-16 ***
## gdpPercap         4.495e-04  2.346e-05  19.158  < 2e-16 ***
## pop               6.570e-09  1.975e-09   3.326 0.000901 ***
## continentAmericas 1.348e+01  6.000e-01  22.458  < 2e-16 ***
## continentAsia     8.193e+00  5.712e-01  14.342  < 2e-16 ***
## continentEurope   1.747e+01  6.246e-01  27.973  < 2e-16 ***
## continentOceania  1.808e+01  1.782e+00  10.146  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.365 on 1697 degrees of freedom
## Multiple R-squared:  0.5821, Adjusted R-squared:  0.5806 
## F-statistic: 393.9 on 6 and 1697 DF,  p-value: < 2.2e-16
# str()関数を使うことでモデルオブジェクトの構造がわかる
str(out)
## List of 13
##  $ coefficients : Named num [1:7] 4.78e+01 4.50e-04 6.57e-09 1.35e+01 8.19 ...
##   ..- attr(*, "names")= chr [1:7] "(Intercept)" "gdpPercap" "pop" "continentAmericas" ...
##  $ residuals    : Named num [1:1704] -27.6 -26.1 -24.5 -22.4 -20.3 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1704] -2455.1 311.1 42.6 101.1 -17.2 ...
##   ..- attr(*, "names")= chr [1:1704] "(Intercept)" "gdpPercap" "pop" "continentAmericas" ...
##  $ rank         : int 7
##  $ fitted.values: Named num [1:1704] 56.4 56.4 56.5 56.5 56.4 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ assign       : int [1:7] 0 1 2 3 3 3 3
##  $ qr           :List of 5
##   ..$ qr   : num [1:1704, 1:7] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:7] "(Intercept)" "gdpPercap" "pop" "continentAmericas" ...
##   .. ..- attr(*, "assign")= int [1:7] 0 1 2 3 3 3 3
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ continent: chr "contr.treatment"
##   ..$ qraux: num [1:7] 1.02 1.02 1 1.01 1.04 ...
##   ..$ pivot: int [1:7] 1 2 3 4 5 6 7
##   ..$ tol  : num 1e-07
##   ..$ rank : int 7
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 1697
##  $ contrasts    :List of 1
##   ..$ continent: chr "contr.treatment"
##  $ xlevels      :List of 1
##   ..$ continent: chr [1:5] "Africa" "Americas" "Asia" "Europe" ...
##  $ call         : language lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
##  $ terms        :Classes 'terms', 'formula'  language lifeExp ~ gdpPercap + pop + continent
##   .. ..- attr(*, "variables")= language list(lifeExp, gdpPercap, pop, continent)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "lifeExp" "gdpPercap" "pop" "continent"
##   .. .. .. ..$ : chr [1:3] "gdpPercap" "pop" "continent"
##   .. ..- attr(*, "term.labels")= chr [1:3] "gdpPercap" "pop" "continent"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(lifeExp, gdpPercap, pop, continent)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:4] "lifeExp" "gdpPercap" "pop" "continent"
##  $ model        :'data.frame':   1704 obs. of  4 variables:
##   ..$ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##   ..$ gdpPercap: num [1:1704] 779 821 853 836 740 ...
##   ..$ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##   ..$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language lifeExp ~ gdpPercap + pop + continent
##   .. .. ..- attr(*, "variables")= language list(lifeExp, gdpPercap, pop, continent)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "lifeExp" "gdpPercap" "pop" "continent"
##   .. .. .. .. ..$ : chr [1:3] "gdpPercap" "pop" "continent"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "gdpPercap" "pop" "continent"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(lifeExp, gdpPercap, pop, continent)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "factor"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "lifeExp" "gdpPercap" "pop" "continent"
##  - attr(*, "class")= chr "lm"

6.3 モデルから図に使えるデータを正しく抽出する

モデルの結果を効果的に可視化するのは難しい. なぜならばモデルの結果を示す際にはデータの背景知識に基づく知見とモデルの解釈の両方が必要になるから

モデルの推定値の図示はモデルの適切な推定と深く関わっているため,統計をしっかり学ぶ以外に作図のスキルをあげる方法はありません. モデルの中身を理解しないまま可視化するのはやめる

6.3.1 適切な用語による調査結果の説明

モデルの可視化を適切に行えれば,分析で得ようとしている問題に対して実質的に意味があり,かつ直接結果を解釈できる図が得られる. 解釈可能な結果を示したいなら,読み手が容易に理解できる尺度を使う必要がある.

6.3.2 信頼度の可視化

結果の不確実性や信頼度をはっきりと可視化する図を作るのは難しい. モデル推定にはさまざまな指標(精度,信頼区間,信用区間,有意性など)が用いられるが, これらの情報が本来持っている情報量以上に過信する傾向があり, 解釈を誤ったり,結果から言えること以上のことを言ってしまいがち

6.3.3 どんなデータを可視化するのか

多変量モデルの結果の図示. - 回帰係数の氷河実際にどういう意味なのかを,有意性や傾きの大きさを示し,重要度で分類する - 単純なモデルの回帰係数ではなく,関心のある範囲におけるいくつかの変数の予測値を示すこと - 元データの上から,モデルの推定値を元データに組み合わせて,可視化する

6.4 予測の図示

Rで用いられる関数はさまざまなオブジェクトに対して汎用的に使うことができる.

predict()関数: モデルオブジェクトから予測値を生成する関数

# モデルの作成
out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)

# 予測するための新しいデータの作成
min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)

# 他らしいデータの作成
pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp,
                                        to = max_gdp,
                                        length.out = 100)),
                       pop = med_pop,
                       continent = c("Africa", "Americas",
                                     "Asia", "Europe", "Oceania"))
dim(pred_df)
## [1] 500   3
head(pred_df)
##   gdpPercap     pop continent
## 1  241.1659 7023596    Africa
## 2 1385.4282 7023596    Africa
## 3 2529.6905 7023596    Africa
## 4 3673.9528 7023596    Africa
## 5 4818.2150 7023596    Africa
## 6 5962.4773 7023596    Africa
# 予測
pred_out <- predict(object = out,
                    newdata = pred_df,
                    interval = "predict")
head(pred_out)
##        fit      lwr      upr
## 1 47.96863 31.54775 64.38951
## 2 48.48298 32.06231 64.90365
## 3 48.99733 32.57670 65.41797
## 4 49.51169 33.09092 65.93245
## 5 50.02604 33.60497 66.44711
## 6 50.54039 34.11885 66.96193
# 予測のためのデータと予測結果を結合させる
pred_df <- cbind(pred_df, pred_out)
pred_df
##       gdpPercap     pop continent       fit      lwr       upr
## 1      241.1659 7023596    Africa  47.96863 31.54775  64.38951
## 2     1385.4282 7023596    Africa  48.48298 32.06231  64.90365
## 3     2529.6905 7023596    Africa  48.99733 32.57670  65.41797
## 4     3673.9528 7023596    Africa  49.51169 33.09092  65.93245
## 5     4818.2150 7023596    Africa  50.02604 33.60497  66.44711
## 6     5962.4773 7023596    Africa  50.54039 34.11885  66.96193
## 7     7106.7396 7023596    Africa  51.05474 34.63256  67.47692
## 8     8251.0019 7023596    Africa  51.56910 35.14611  67.99208
## 9     9395.2642 7023596    Africa  52.08345 35.65949  68.50741
## 10   10539.5265 7023596    Africa  52.59780 36.17269  69.02291
## 11   11683.7888 7023596    Africa  53.11215 36.68573  69.53857
## 12   12828.0511 7023596    Africa  53.62651 37.19860  70.05441
## 13   13972.3134 7023596    Africa  54.14086 37.71130  70.57041
## 14   15116.5757 7023596    Africa  54.65521 38.22384  71.08658
## 15   16260.8380 7023596    Africa  55.16956 38.73620  71.60292
## 16   17405.1003 7023596    Africa  55.68391 39.24840  72.11943
## 17   18549.3626 7023596    Africa  56.19827 39.76042  72.63611
## 18   19693.6249 7023596    Africa  56.71262 40.27228  73.15296
## 19   20837.8872 7023596    Africa  57.22697 40.78397  73.66997
## 20   21982.1494 7023596    Africa  57.74132 41.29550  74.18715
## 21   23126.4117 7023596    Africa  58.25568 41.80685  74.70450
## 22   24270.6740 7023596    Africa  58.77003 42.31804  75.22202
## 23   25414.9363 7023596    Africa  59.28438 42.82906  75.73971
## 24   26559.1986 7023596    Africa  59.79873 43.33991  76.25756
## 25   27703.4609 7023596    Africa  60.31309 43.85059  76.77558
## 26   28847.7232 7023596    Africa  60.82744 44.36111  77.29377
## 27   29991.9855 7023596    Africa  61.34179 44.87145  77.81213
## 28   31136.2478 7023596    Africa  61.85614 45.38163  78.33066
## 29   32280.5101 7023596    Africa  62.37050 45.89165  78.84935
## 30   33424.7724 7023596    Africa  62.88485 46.40149  79.36821
## 31   34569.0347 7023596    Africa  63.39920 46.91117  79.88723
## 32   35713.2970 7023596    Africa  63.91355 47.42069  80.40642
## 33   36857.5593 7023596    Africa  64.42791 47.93003  80.92578
## 34   38001.8216 7023596    Africa  64.94226 48.43921  81.44531
## 35   39146.0838 7023596    Africa  65.45661 48.94822  81.96500
## 36   40290.3461 7023596    Africa  65.97096 49.45707  82.48486
## 37   41434.6084 7023596    Africa  66.48532 49.96575  83.00488
## 38   42578.8707 7023596    Africa  66.99967 50.47427  83.52507
## 39   43723.1330 7023596    Africa  67.51402 50.98262  84.04543
## 40   44867.3953 7023596    Africa  68.02837 51.49080  84.56595
## 41   46011.6576 7023596    Africa  68.54273 51.99882  85.08664
## 42   47155.9199 7023596    Africa  69.05708 52.50667  85.60749
## 43   48300.1822 7023596    Africa  69.57143 53.01436  86.12850
## 44   49444.4445 7023596    Africa  70.08578 53.52188  86.64968
## 45   50588.7068 7023596    Africa  70.60014 54.02924  87.17103
## 46   51732.9691 7023596    Africa  71.11449 54.53644  87.69254
## 47   52877.2314 7023596    Africa  71.62884 55.04347  88.21421
## 48   54021.4937 7023596    Africa  72.14319 55.55034  88.73605
## 49   55165.7559 7023596    Africa  72.65755 56.05704  89.25805
## 50   56310.0182 7023596    Africa  73.17190 56.56358  89.78022
## 51   57454.2805 7023596    Africa  73.68625 57.06996  90.30255
## 52   58598.5428 7023596    Africa  74.20060 57.57617  90.82504
## 53   59742.8051 7023596    Africa  74.71496 58.08222  91.34769
## 54   60887.0674 7023596    Africa  75.22931 58.58811  91.87051
## 55   62031.3297 7023596    Africa  75.74366 59.09384  92.39349
## 56   63175.5920 7023596    Africa  76.25801 59.59940  92.91663
## 57   64319.8543 7023596    Africa  76.77237 60.10480  93.43993
## 58   65464.1166 7023596    Africa  77.28672 60.61004  93.96340
## 59   66608.3789 7023596    Africa  77.80107 61.11512  94.48702
## 60   67752.6412 7023596    Africa  78.31542 61.62004  95.01081
## 61   68896.9035 7023596    Africa  78.82978 62.12480  95.53475
## 62   70041.1658 7023596    Africa  79.34413 62.62940  96.05886
## 63   71185.4281 7023596    Africa  79.85848 63.13384  96.58313
## 64   72329.6903 7023596    Africa  80.37283 63.63811  97.10756
## 65   73473.9526 7023596    Africa  80.88719 64.14223  97.63214
## 66   74618.2149 7023596    Africa  81.40154 64.64619  98.15689
## 67   75762.4772 7023596    Africa  81.91589 65.14999  98.68179
## 68   76906.7395 7023596    Africa  82.43024 65.65363  99.20686
## 69   78051.0018 7023596    Africa  82.94460 66.15712  99.73208
## 70   79195.2641 7023596    Africa  83.45895 66.66044 100.25746
## 71   80339.5264 7023596    Africa  83.97330 67.16361 100.78300
## 72   81483.7887 7023596    Africa  84.48765 67.66662 101.30869
## 73   82628.0510 7023596    Africa  85.00201 68.16947 101.83454
## 74   83772.3133 7023596    Africa  85.51636 68.67217 102.36055
## 75   84916.5756 7023596    Africa  86.03071 69.17471 102.88672
## 76   86060.8379 7023596    Africa  86.54506 69.67709 103.41304
## 77   87205.1002 7023596    Africa  87.05942 70.17932 103.93952
## 78   88349.3625 7023596    Africa  87.57377 70.68139 104.46615
## 79   89493.6247 7023596    Africa  88.08812 71.18331 104.99294
## 80   90637.8870 7023596    Africa  88.60247 71.68507 105.51988
## 81   91782.1493 7023596    Africa  89.11683 72.18668 106.04698
## 82   92926.4116 7023596    Africa  89.63118 72.68813 106.57423
## 83   94070.6739 7023596    Africa  90.14553 73.18943 107.10163
## 84   95214.9362 7023596    Africa  90.65988 73.69058 107.62919
## 85   96359.1985 7023596    Africa  91.17424 74.19157 108.15690
## 86   97503.4608 7023596    Africa  91.68859 74.69241 108.68477
## 87   98647.7231 7023596    Africa  92.20294 75.19310 109.21278
## 88   99791.9854 7023596    Africa  92.71729 75.69364 109.74095
## 89  100936.2477 7023596    Africa  93.23165 76.19402 110.26927
## 90  102080.5100 7023596    Africa  93.74600 76.69426 110.79774
## 91  103224.7723 7023596    Africa  94.26035 77.19434 111.32636
## 92  104369.0346 7023596    Africa  94.77470 77.69427 111.85513
## 93  105513.2968 7023596    Africa  95.28906 78.19406 112.38406
## 94  106657.5591 7023596    Africa  95.80341 78.69369 112.91313
## 95  107801.8214 7023596    Africa  96.31776 79.19317 113.44235
## 96  108946.0837 7023596    Africa  96.83211 79.69251 113.97172
## 97  110090.3460 7023596    Africa  97.34647 80.19170 114.50124
## 98  111234.6083 7023596    Africa  97.86082 80.69073 115.03090
## 99  112378.8706 7023596    Africa  98.37517 81.18962 115.56072
## 100 113523.1329 7023596    Africa  98.88952 81.68837 116.09068
## 101    241.1659 7023596  Americas  61.44457 45.00649  77.88265
## 102   1385.4282 7023596  Americas  61.95892 45.52179  78.39606
## 103   2529.6905 7023596  Americas  62.47328 46.03691  78.90964
## 104   3673.9528 7023596  Americas  62.98763 46.55187  79.42338
## 105   4818.2150 7023596  Americas  63.50198 47.06666  79.93730
## 106   5962.4773 7023596  Americas  64.01633 47.58129  80.45138
## 107   7106.7396 7023596  Americas  64.53069 48.09574  80.96563
## 108   8251.0019 7023596  Americas  65.04504 48.61002  81.48005
## 109   9395.2642 7023596  Americas  65.55939 49.12414  81.99464
## 110  10539.5265 7023596  Americas  66.07374 49.63809  82.50940
## 111  11683.7888 7023596  Americas  66.58810 50.15186  83.02433
## 112  12828.0511 7023596  Americas  67.10245 50.66547  83.53942
## 113  13972.3134 7023596  Americas  67.61680 51.17891  84.05469
## 114  15116.5757 7023596  Americas  68.13115 51.69219  84.57012
## 115  16260.8380 7023596  Americas  68.64551 52.20529  85.08572
## 116  17405.1003 7023596  Americas  69.15986 52.71822  85.60149
## 117  18549.3626 7023596  Americas  69.67421 53.23099  86.11743
## 118  19693.6249 7023596  Americas  70.18856 53.74359  86.63354
## 119  20837.8872 7023596  Americas  70.70292 54.25602  87.14981
## 120  21982.1494 7023596  Americas  71.21727 54.76828  87.66626
## 121  23126.4117 7023596  Americas  71.73162 55.28037  88.18287
## 122  24270.6740 7023596  Americas  72.24597 55.79230  88.69965
## 123  25414.9363 7023596  Americas  72.76033 56.30405  89.21660
## 124  26559.1986 7023596  Americas  73.27468 56.81564  89.73371
## 125  27703.4609 7023596  Americas  73.78903 57.32706  90.25100
## 126  28847.7232 7023596  Americas  74.30338 57.83832  90.76845
## 127  29991.9855 7023596  Americas  74.81774 58.34940  91.28607
## 128  31136.2478 7023596  Americas  75.33209 58.86032  91.80386
## 129  32280.5101 7023596  Americas  75.84644 59.37107  92.32181
## 130  33424.7724 7023596  Americas  76.36079 59.88165  92.83994
## 131  34569.0347 7023596  Americas  76.87515 60.39206  93.35823
## 132  35713.2970 7023596  Americas  77.38950 60.90231  93.87669
## 133  36857.5593 7023596  Americas  77.90385 61.41239  94.39531
## 134  38001.8216 7023596  Americas  78.41820 61.92230  94.91410
## 135  39146.0838 7023596  Americas  78.93256 62.43205  95.43306
## 136  40290.3461 7023596  Americas  79.44691 62.94163  95.95219
## 137  41434.6084 7023596  Americas  79.96126 63.45104  96.47148
## 138  42578.8707 7023596  Americas  80.47561 63.96029  96.99094
## 139  43723.1330 7023596  Americas  80.98997 64.46937  97.51056
## 140  44867.3953 7023596  Americas  81.50432 64.97829  98.03035
## 141  46011.6576 7023596  Americas  82.01867 65.48703  98.55031
## 142  47155.9199 7023596  Americas  82.53302 65.99562  99.07043
## 143  48300.1822 7023596  Americas  83.04738 66.50403  99.59072
## 144  49444.4445 7023596  Americas  83.56173 67.01228 100.11117
## 145  50588.7068 7023596  Americas  84.07608 67.52037 100.63179
## 146  51732.9691 7023596  Americas  84.59043 68.02829 101.15257
## 147  52877.2314 7023596  Americas  85.10479 68.53605 101.67352
## 148  54021.4937 7023596  Americas  85.61914 69.04364 102.19464
## 149  55165.7559 7023596  Americas  86.13349 69.55107 102.71591
## 150  56310.0182 7023596  Americas  86.64784 70.05833 103.23736
## 151  57454.2805 7023596  Americas  87.16220 70.56543 103.75896
## 152  58598.5428 7023596  Americas  87.67655 71.07236 104.28073
## 153  59742.8051 7023596  Americas  88.19090 71.57914 104.80266
## 154  60887.0674 7023596  Americas  88.70525 72.08574 105.32476
## 155  62031.3297 7023596  Americas  89.21961 72.59219 105.84702
## 156  63175.5920 7023596  Americas  89.73396 73.09847 106.36944
## 157  64319.8543 7023596  Americas  90.24831 73.60459 106.89203
## 158  65464.1166 7023596  Americas  90.76266 74.11055 107.41478
## 159  66608.3789 7023596  Americas  91.27702 74.61634 107.93769
## 160  67752.6412 7023596  Americas  91.79137 75.12197 108.46076
## 161  68896.9035 7023596  Americas  92.30572 75.62745 108.98399
## 162  70041.1658 7023596  Americas  92.82007 76.13276 109.50739
## 163  71185.4281 7023596  Americas  93.33443 76.63790 110.03095
## 164  72329.6903 7023596  Americas  93.84878 77.14289 110.55466
## 165  73473.9526 7023596  Americas  94.36313 77.64772 111.07854
## 166  74618.2149 7023596  Americas  94.87748 78.15238 111.60258
## 167  75762.4772 7023596  Americas  95.39184 78.65689 112.12678
## 168  76906.7395 7023596  Americas  95.90619 79.16124 112.65114
## 169  78051.0018 7023596  Americas  96.42054 79.66542 113.17566
## 170  79195.2641 7023596  Americas  96.93489 80.16945 113.70033
## 171  80339.5264 7023596  Americas  97.44925 80.67332 114.22517
## 172  81483.7887 7023596  Americas  97.96360 81.17703 114.75016
## 173  82628.0510 7023596  Americas  98.47795 81.68058 115.27532
## 174  83772.3133 7023596  Americas  98.99230 82.18398 115.80063
## 175  84916.5756 7023596  Americas  99.50666 82.68721 116.32610
## 176  86060.8379 7023596  Americas 100.02101 83.19029 116.85172
## 177  87205.1002 7023596  Americas 100.53536 83.69321 117.37751
## 178  88349.3625 7023596  Americas 101.04971 84.19598 117.90345
## 179  89493.6247 7023596  Americas 101.56406 84.69859 118.42954
## 180  90637.8870 7023596  Americas 102.07842 85.20104 118.95580
## 181  91782.1493 7023596  Americas 102.59277 85.70334 119.48220
## 182  92926.4116 7023596  Americas 103.10712 86.20548 120.00877
## 183  94070.6739 7023596  Americas 103.62147 86.70746 120.53549
## 184  95214.9362 7023596  Americas 104.13583 87.20929 121.06236
## 185  96359.1985 7023596  Americas 104.65018 87.71097 121.58939
## 186  97503.4608 7023596  Americas 105.16453 88.21249 122.11657
## 187  98647.7231 7023596  Americas 105.67888 88.71386 122.64391
## 188  99791.9854 7023596  Americas 106.19324 89.21508 123.17140
## 189 100936.2477 7023596  Americas 106.70759 89.71614 123.69904
## 190 102080.5100 7023596  Americas 107.22194 90.21705 124.22684
## 191 103224.7723 7023596  Americas 107.73629 90.71781 124.75478
## 192 104369.0346 7023596  Americas 108.25065 91.21841 125.28288
## 193 105513.2968 7023596  Americas 108.76500 91.71886 125.81114
## 194 106657.5591 7023596  Americas 109.27935 92.21917 126.33954
## 195 107801.8214 7023596  Americas 109.79370 92.71932 126.86809
## 196 108946.0837 7023596  Americas 110.30806 93.21932 127.39679
## 197 110090.3460 7023596  Americas 110.82241 93.71917 127.92565
## 198 111234.6083 7023596  Americas 111.33676 94.21887 128.45465
## 199 112378.8706 7023596  Americas 111.85111 94.71842 128.98380
## 200 113523.1329 7023596  Americas 112.36547 95.21783 129.51311
## 201    241.1659 7023596      Asia  56.16126 39.72673  72.59578
## 202   1385.4282 7023596      Asia  56.67561 40.24218  73.10904
## 203   2529.6905 7023596      Asia  57.18996 40.75746  73.62247
## 204   3673.9528 7023596      Asia  57.70432 41.27256  74.13607
## 205   4818.2150 7023596      Asia  58.21867 41.78750  74.64984
## 206   5962.4773 7023596      Asia  58.73302 42.30227  75.16377
## 207   7106.7396 7023596      Asia  59.24737 42.81687  75.67788
## 208   8251.0019 7023596      Asia  59.76173 43.33131  76.19215
## 209   9395.2642 7023596      Asia  60.27608 43.84557  76.70659
## 210  10539.5265 7023596      Asia  60.79043 44.35967  77.22120
## 211  11683.7888 7023596      Asia  61.30478 44.87359  77.73598
## 212  12828.0511 7023596      Asia  61.81914 45.38735  78.25092
## 213  13972.3134 7023596      Asia  62.33349 45.90094  78.76604
## 214  15116.5757 7023596      Asia  62.84784 46.41436  79.28133
## 215  16260.8380 7023596      Asia  63.36219 46.92761  79.79678
## 216  17405.1003 7023596      Asia  63.87655 47.44069  80.31240
## 217  18549.3626 7023596      Asia  64.39090 47.95361  80.82819
## 218  19693.6249 7023596      Asia  64.90525 48.46635  81.34415
## 219  20837.8872 7023596      Asia  65.41960 48.97893  81.86028
## 220  21982.1494 7023596      Asia  65.93396 49.49134  82.37658
## 221  23126.4117 7023596      Asia  66.44831 50.00358  82.89304
## 222  24270.6740 7023596      Asia  66.96266 50.51565  83.40967
## 223  25414.9363 7023596      Asia  67.47701 51.02755  83.92647
## 224  26559.1986 7023596      Asia  67.99137 51.53929  84.44344
## 225  27703.4609 7023596      Asia  68.50572 52.05086  84.96058
## 226  28847.7232 7023596      Asia  69.02007 52.56226  85.47789
## 227  29991.9855 7023596      Asia  69.53442 53.07349  85.99536
## 228  31136.2478 7023596      Asia  70.04878 53.58455  86.51300
## 229  32280.5101 7023596      Asia  70.56313 54.09545  87.03081
## 230  33424.7724 7023596      Asia  71.07748 54.60618  87.54879
## 231  34569.0347 7023596      Asia  71.59183 55.11674  88.06693
## 232  35713.2970 7023596      Asia  72.10619 55.62713  88.58524
## 233  36857.5593 7023596      Asia  72.62054 56.13736  89.10372
## 234  38001.8216 7023596      Asia  73.13489 56.64741  89.62237
## 235  39146.0838 7023596      Asia  73.64924 57.15731  90.14118
## 236  40290.3461 7023596      Asia  74.16360 57.66703  90.66016
## 237  41434.6084 7023596      Asia  74.67795 58.17659  91.17931
## 238  42578.8707 7023596      Asia  75.19230 58.68598  91.69862
## 239  43723.1330 7023596      Asia  75.70665 59.19521  92.21810
## 240  44867.3953 7023596      Asia  76.22101 59.70427  92.73775
## 241  46011.6576 7023596      Asia  76.73536 60.21316  93.25756
## 242  47155.9199 7023596      Asia  77.24971 60.72189  93.77754
## 243  48300.1822 7023596      Asia  77.76406 61.23045  94.29768
## 244  49444.4445 7023596      Asia  78.27842 61.73884  94.81799
## 245  50588.7068 7023596      Asia  78.79277 62.24707  95.33847
## 246  51732.9691 7023596      Asia  79.30712 62.75514  95.85911
## 247  52877.2314 7023596      Asia  79.82147 63.26304  96.37991
## 248  54021.4937 7023596      Asia  80.33583 63.77077  96.90088
## 249  55165.7559 7023596      Asia  80.85018 64.27834  97.42202
## 250  56310.0182 7023596      Asia  81.36453 64.78575  97.94332
## 251  57454.2805 7023596      Asia  81.87888 65.29299  98.46478
## 252  58598.5428 7023596      Asia  82.39324 65.80007  98.98641
## 253  59742.8051 7023596      Asia  82.90759 66.30698  99.50820
## 254  60887.0674 7023596      Asia  83.42194 66.81373 100.03015
## 255  62031.3297 7023596      Asia  83.93629 67.32031 100.55227
## 256  63175.5920 7023596      Asia  84.45065 67.82674 101.07456
## 257  64319.8543 7023596      Asia  84.96500 68.33300 101.59700
## 258  65464.1166 7023596      Asia  85.47935 68.83910 102.11961
## 259  66608.3789 7023596      Asia  85.99370 69.34503 102.64238
## 260  67752.6412 7023596      Asia  86.50806 69.85080 103.16531
## 261  68896.9035 7023596      Asia  87.02241 70.35641 103.68840
## 262  70041.1658 7023596      Asia  87.53676 70.86186 104.21166
## 263  71185.4281 7023596      Asia  88.05111 71.36715 104.73508
## 264  72329.6903 7023596      Asia  88.56547 71.87228 105.25866
## 265  73473.9526 7023596      Asia  89.07982 72.37724 105.78239
## 266  74618.2149 7023596      Asia  89.59417 72.88205 106.30629
## 267  75762.4772 7023596      Asia  90.10852 73.38669 106.83036
## 268  76906.7395 7023596      Asia  90.62288 73.89118 107.35458
## 269  78051.0018 7023596      Asia  91.13723 74.39550 107.87896
## 270  79195.2641 7023596      Asia  91.65158 74.89967 108.40350
## 271  80339.5264 7023596      Asia  92.16593 75.40367 108.92820
## 272  81483.7887 7023596      Asia  92.68029 75.90752 109.45305
## 273  82628.0510 7023596      Asia  93.19464 76.41121 109.97807
## 274  83772.3133 7023596      Asia  93.70899 76.91474 110.50325
## 275  84916.5756 7023596      Asia  94.22334 77.41811 111.02858
## 276  86060.8379 7023596      Asia  94.73770 77.92132 111.55407
## 277  87205.1002 7023596      Asia  95.25205 78.42438 112.07972
## 278  88349.3625 7023596      Asia  95.76640 78.92728 112.60552
## 279  89493.6247 7023596      Asia  96.28075 79.43002 113.13148
## 280  90637.8870 7023596      Asia  96.79511 79.93261 113.65760
## 281  91782.1493 7023596      Asia  97.30946 80.43504 114.18388
## 282  92926.4116 7023596      Asia  97.82381 80.93731 114.71031
## 283  94070.6739 7023596      Asia  98.33816 81.43943 115.23689
## 284  95214.9362 7023596      Asia  98.85252 81.94140 115.76364
## 285  96359.1985 7023596      Asia  99.36687 82.44321 116.29053
## 286  97503.4608 7023596      Asia  99.88122 82.94486 116.81758
## 287  98647.7231 7023596      Asia 100.39557 83.44636 117.34479
## 288  99791.9854 7023596      Asia 100.90993 83.94771 117.87214
## 289 100936.2477 7023596      Asia 101.42428 84.44890 118.39966
## 290 102080.5100 7023596      Asia 101.93863 84.94994 118.92732
## 291 103224.7723 7023596      Asia 102.45298 85.45083 119.45514
## 292 104369.0346 7023596      Asia 102.96734 85.95156 119.98311
## 293 105513.2968 7023596      Asia 103.48169 86.45215 120.51123
## 294 106657.5591 7023596      Asia 103.99604 86.95258 121.03950
## 295 107801.8214 7023596      Asia 104.51039 87.45286 121.56793
## 296 108946.0837 7023596      Asia 105.02475 87.95299 122.09650
## 297 110090.3460 7023596      Asia 105.53910 88.45297 122.62523
## 298 111234.6083 7023596      Asia 106.05345 88.95280 123.15410
## 299 112378.8706 7023596      Asia 106.56780 89.45248 123.68313
## 300 113523.1329 7023596      Asia 107.08216 89.95201 124.21230
## 301    241.1659 7023596    Europe  65.44132 48.99789  81.88475
## 302   1385.4282 7023596    Europe  65.95567 49.51426  82.39708
## 303   2529.6905 7023596    Europe  66.47003 50.03047  82.90959
## 304   3673.9528 7023596    Europe  66.98438 50.54650  83.42226
## 305   4818.2150 7023596    Europe  67.49873 51.06237  83.93509
## 306   5962.4773 7023596    Europe  68.01308 51.57806  84.44810
## 307   7106.7396 7023596    Europe  68.52744 52.09359  84.96128
## 308   8251.0019 7023596    Europe  69.04179 52.60896  85.47462
## 309   9395.2642 7023596    Europe  69.55614 53.12415  85.98813
## 310  10539.5265 7023596    Europe  70.07049 53.63917  86.50182
## 311  11683.7888 7023596    Europe  70.58485 54.15402  87.01567
## 312  12828.0511 7023596    Europe  71.09920 54.66871  87.52968
## 313  13972.3134 7023596    Europe  71.61355 55.18323  88.04387
## 314  15116.5757 7023596    Europe  72.12790 55.69758  88.55823
## 315  16260.8380 7023596    Europe  72.64226 56.21176  89.07276
## 316  17405.1003 7023596    Europe  73.15661 56.72577  89.58745
## 317  18549.3626 7023596    Europe  73.67096 57.23961  90.10231
## 318  19693.6249 7023596    Europe  74.18531 57.75328  90.61734
## 319  20837.8872 7023596    Europe  74.69967 58.26679  91.13254
## 320  21982.1494 7023596    Europe  75.21402 58.78012  91.64791
## 321  23126.4117 7023596    Europe  75.72837 59.29329  92.16345
## 322  24270.6740 7023596    Europe  76.24272 59.80629  92.67916
## 323  25414.9363 7023596    Europe  76.75708 60.31912  93.19503
## 324  26559.1986 7023596    Europe  77.27143 60.83178  93.71108
## 325  27703.4609 7023596    Europe  77.78578 61.34427  94.22729
## 326  28847.7232 7023596    Europe  78.30013 61.85660  94.74367
## 327  29991.9855 7023596    Europe  78.81449 62.36875  95.26022
## 328  31136.2478 7023596    Europe  79.32884 62.88074  95.77693
## 329  32280.5101 7023596    Europe  79.84319 63.39256  96.29382
## 330  33424.7724 7023596    Europe  80.35754 63.90421  96.81087
## 331  34569.0347 7023596    Europe  80.87190 64.41570  97.32810
## 332  35713.2970 7023596    Europe  81.38625 64.92701  97.84548
## 333  36857.5593 7023596    Europe  81.90060 65.43816  98.36304
## 334  38001.8216 7023596    Europe  82.41495 65.94914  98.88077
## 335  39146.0838 7023596    Europe  82.92931 66.45995  99.39866
## 336  40290.3461 7023596    Europe  83.44366 66.97059  99.91672
## 337  41434.6084 7023596    Europe  83.95801 67.48107 100.43495
## 338  42578.8707 7023596    Europe  84.47236 67.99138 100.95334
## 339  43723.1330 7023596    Europe  84.98671 68.50152 101.47191
## 340  44867.3953 7023596    Europe  85.50107 69.01150 101.99064
## 341  46011.6576 7023596    Europe  86.01542 69.52131 102.50953
## 342  47155.9199 7023596    Europe  86.52977 70.03095 103.02860
## 343  48300.1822 7023596    Europe  87.04412 70.54042 103.54783
## 344  49444.4445 7023596    Europe  87.55848 71.04973 104.06722
## 345  50588.7068 7023596    Europe  88.07283 71.55887 104.58678
## 346  51732.9691 7023596    Europe  88.58718 72.06785 105.10651
## 347  52877.2314 7023596    Europe  89.10153 72.57666 105.62641
## 348  54021.4937 7023596    Europe  89.61589 73.08530 106.14647
## 349  55165.7559 7023596    Europe  90.13024 73.59378 106.66670
## 350  56310.0182 7023596    Europe  90.64459 74.10210 107.18709
## 351  57454.2805 7023596    Europe  91.15894 74.61024 107.70765
## 352  58598.5428 7023596    Europe  91.67330 75.11822 108.22837
## 353  59742.8051 7023596    Europe  92.18765 75.62604 108.74926
## 354  60887.0674 7023596    Europe  92.70200 76.13369 109.27031
## 355  62031.3297 7023596    Europe  93.21635 76.64118 109.79153
## 356  63175.5920 7023596    Europe  93.73071 77.14851 110.31291
## 357  64319.8543 7023596    Europe  94.24506 77.65566 110.83445
## 358  65464.1166 7023596    Europe  94.75941 78.16266 111.35616
## 359  66608.3789 7023596    Europe  95.27376 78.66949 111.87804
## 360  67752.6412 7023596    Europe  95.78812 79.17616 112.40008
## 361  68896.9035 7023596    Europe  96.30247 79.68266 112.92228
## 362  70041.1658 7023596    Europe  96.81682 80.18901 113.44464
## 363  71185.4281 7023596    Europe  97.33117 80.69518 113.96716
## 364  72329.6903 7023596    Europe  97.84553 81.20120 114.48985
## 365  73473.9526 7023596    Europe  98.35988 81.70705 115.01270
## 366  74618.2149 7023596    Europe  98.87423 82.21275 115.53572
## 367  75762.4772 7023596    Europe  99.38858 82.71828 116.05889
## 368  76906.7395 7023596    Europe  99.90294 83.22364 116.58223
## 369  78051.0018 7023596    Europe 100.41729 83.72885 117.10573
## 370  79195.2641 7023596    Europe 100.93164 84.23390 117.62939
## 371  80339.5264 7023596    Europe 101.44599 84.73878 118.15321
## 372  81483.7887 7023596    Europe 101.96035 85.24351 118.67719
## 373  82628.0510 7023596    Europe 102.47470 85.74807 119.20133
## 374  83772.3133 7023596    Europe 102.98905 86.25248 119.72563
## 375  84916.5756 7023596    Europe 103.50340 86.75672 120.25009
## 376  86060.8379 7023596    Europe 104.01776 87.26081 120.77471
## 377  87205.1002 7023596    Europe 104.53211 87.76473 121.29949
## 378  88349.3625 7023596    Europe 105.04646 88.26850 121.82442
## 379  89493.6247 7023596    Europe 105.56081 88.77211 122.34952
## 380  90637.8870 7023596    Europe 106.07517 89.27556 122.87477
## 381  91782.1493 7023596    Europe 106.58952 89.77885 123.40019
## 382  92926.4116 7023596    Europe 107.10387 90.28199 123.92576
## 383  94070.6739 7023596    Europe 107.61822 90.78497 124.45148
## 384  95214.9362 7023596    Europe 108.13258 91.28779 124.97737
## 385  96359.1985 7023596    Europe 108.64693 91.79045 125.50341
## 386  97503.4608 7023596    Europe 109.16128 92.29296 126.02960
## 387  98647.7231 7023596    Europe 109.67563 92.79531 126.55596
## 388  99791.9854 7023596    Europe 110.18999 93.29751 127.08246
## 389 100936.2477 7023596    Europe 110.70434 93.79955 127.60913
## 390 102080.5100 7023596    Europe 111.21869 94.30144 128.13595
## 391 103224.7723 7023596    Europe 111.73304 94.80317 128.66292
## 392 104369.0346 7023596    Europe 112.24740 95.30475 129.19005
## 393 105513.2968 7023596    Europe 112.76175 95.80617 129.71733
## 394 106657.5591 7023596    Europe 113.27610 96.30744 130.24476
## 395 107801.8214 7023596    Europe 113.79045 96.80856 130.77235
## 396 108946.0837 7023596    Europe 114.30481 97.30952 131.30009
## 397 110090.3460 7023596    Europe 114.81916 97.81033 131.82799
## 398 111234.6083 7023596    Europe 115.33351 98.31099 132.35603
## 399 112378.8706 7023596    Europe 115.84786 98.81150 132.88423
## 400 113523.1329 7023596    Europe 116.36222 99.31186 133.41258
## 401    241.1659 7023596   Oceania  66.05193 49.28474  82.81912
## 402   1385.4282 7023596   Oceania  66.56628 49.80167  83.33090
## 403   2529.6905 7023596   Oceania  67.08064 50.31843  83.84284
## 404   3673.9528 7023596   Oceania  67.59499 50.83503  84.35495
## 405   4818.2150 7023596   Oceania  68.10934 51.35146  84.86722
## 406   5962.4773 7023596   Oceania  68.62369 51.86773  85.37966
## 407   7106.7396 7023596   Oceania  69.13805 52.38383  85.89226
## 408   8251.0019 7023596   Oceania  69.65240 52.89977  86.40503
## 409   9395.2642 7023596   Oceania  70.16675 53.41554  86.91796
## 410  10539.5265 7023596   Oceania  70.68110 53.93115  87.43106
## 411  11683.7888 7023596   Oceania  71.19546 54.44659  87.94433
## 412  12828.0511 7023596   Oceania  71.70981 54.96186  88.45776
## 413  13972.3134 7023596   Oceania  72.22416 55.47697  88.97135
## 414  15116.5757 7023596   Oceania  72.73851 55.99191  89.48511
## 415  16260.8380 7023596   Oceania  73.25287 56.50669  89.99904
## 416  17405.1003 7023596   Oceania  73.76722 57.02130  90.51313
## 417  18549.3626 7023596   Oceania  74.28157 57.53575  91.02739
## 418  19693.6249 7023596   Oceania  74.79592 58.05003  91.54182
## 419  20837.8872 7023596   Oceania  75.31028 58.56415  92.05641
## 420  21982.1494 7023596   Oceania  75.82463 59.07810  92.57116
## 421  23126.4117 7023596   Oceania  76.33898 59.59188  93.08608
## 422  24270.6740 7023596   Oceania  76.85333 60.10550  93.60117
## 423  25414.9363 7023596   Oceania  77.36769 60.61896  94.11642
## 424  26559.1986 7023596   Oceania  77.88204 61.13224  94.63183
## 425  27703.4609 7023596   Oceania  78.39639 61.64537  95.14742
## 426  28847.7232 7023596   Oceania  78.91074 62.15832  95.66316
## 427  29991.9855 7023596   Oceania  79.42510 62.67112  96.17908
## 428  31136.2478 7023596   Oceania  79.93945 63.18374  96.69516
## 429  32280.5101 7023596   Oceania  80.45380 63.69620  97.21140
## 430  33424.7724 7023596   Oceania  80.96815 64.20850  97.72781
## 431  34569.0347 7023596   Oceania  81.48251 64.72063  98.24438
## 432  35713.2970 7023596   Oceania  81.99686 65.23259  98.76112
## 433  36857.5593 7023596   Oceania  82.51121 65.74440  99.27803
## 434  38001.8216 7023596   Oceania  83.02556 66.25603  99.79510
## 435  39146.0838 7023596   Oceania  83.53992 66.76750 100.31233
## 436  40290.3461 7023596   Oceania  84.05427 67.27881 100.82973
## 437  41434.6084 7023596   Oceania  84.56862 67.78995 101.34729
## 438  42578.8707 7023596   Oceania  85.08297 68.30093 101.86502
## 439  43723.1330 7023596   Oceania  85.59733 68.81174 102.38292
## 440  44867.3953 7023596   Oceania  86.11168 69.32238 102.90097
## 441  46011.6576 7023596   Oceania  86.62603 69.83287 103.41919
## 442  47155.9199 7023596   Oceania  87.14038 70.34319 103.93758
## 443  48300.1822 7023596   Oceania  87.65474 70.85334 104.45613
## 444  49444.4445 7023596   Oceania  88.16909 71.36333 104.97484
## 445  50588.7068 7023596   Oceania  88.68344 71.87316 105.49372
## 446  51732.9691 7023596   Oceania  89.19779 72.38282 106.01276
## 447  52877.2314 7023596   Oceania  89.71215 72.89232 106.53197
## 448  54021.4937 7023596   Oceania  90.22650 73.40166 107.05134
## 449  55165.7559 7023596   Oceania  90.74085 73.91083 107.57087
## 450  56310.0182 7023596   Oceania  91.25520 74.41984 108.09056
## 451  57454.2805 7023596   Oceania  91.76956 74.92869 108.61042
## 452  58598.5428 7023596   Oceania  92.28391 75.43738 109.13044
## 453  59742.8051 7023596   Oceania  92.79826 75.94590 109.65063
## 454  60887.0674 7023596   Oceania  93.31261 76.45426 110.17097
## 455  62031.3297 7023596   Oceania  93.82697 76.96245 110.69148
## 456  63175.5920 7023596   Oceania  94.34132 77.47049 111.21215
## 457  64319.8543 7023596   Oceania  94.85567 77.97836 111.73298
## 458  65464.1166 7023596   Oceania  95.37002 78.48607 112.25397
## 459  66608.3789 7023596   Oceania  95.88438 78.99362 112.77513
## 460  67752.6412 7023596   Oceania  96.39873 79.50101 113.29645
## 461  68896.9035 7023596   Oceania  96.91308 80.00824 113.81792
## 462  70041.1658 7023596   Oceania  97.42743 80.51530 114.33956
## 463  71185.4281 7023596   Oceania  97.94179 81.02221 114.86136
## 464  72329.6903 7023596   Oceania  98.45614 81.52895 115.38332
## 465  73473.9526 7023596   Oceania  98.97049 82.03554 115.90544
## 466  74618.2149 7023596   Oceania  99.48484 82.54196 116.42772
## 467  75762.4772 7023596   Oceania  99.99920 83.04823 116.95016
## 468  76906.7395 7023596   Oceania 100.51355 83.55433 117.47276
## 469  78051.0018 7023596   Oceania 101.02790 84.06028 117.99552
## 470  79195.2641 7023596   Oceania 101.54225 84.56606 118.51844
## 471  80339.5264 7023596   Oceania 102.05661 85.07169 119.04152
## 472  81483.7887 7023596   Oceania 102.57096 85.57716 119.56476
## 473  82628.0510 7023596   Oceania 103.08531 86.08247 120.08815
## 474  83772.3133 7023596   Oceania 103.59966 86.58762 120.61170
## 475  84916.5756 7023596   Oceania 104.11402 87.09262 121.13542
## 476  86060.8379 7023596   Oceania 104.62837 87.59745 121.65928
## 477  87205.1002 7023596   Oceania 105.14272 88.10213 122.18331
## 478  88349.3625 7023596   Oceania 105.65707 88.60665 122.70749
## 479  89493.6247 7023596   Oceania 106.17143 89.11102 123.23183
## 480  90637.8870 7023596   Oceania 106.68578 89.61523 123.75633
## 481  91782.1493 7023596   Oceania 107.20013 90.11928 124.28098
## 482  92926.4116 7023596   Oceania 107.71448 90.62318 124.80579
## 483  94070.6739 7023596   Oceania 108.22884 91.12692 125.33076
## 484  95214.9362 7023596   Oceania 108.74319 91.63050 125.85588
## 485  96359.1985 7023596   Oceania 109.25754 92.13393 126.38115
## 486  97503.4608 7023596   Oceania 109.77189 92.63720 126.90658
## 487  98647.7231 7023596   Oceania 110.28625 93.14032 127.43217
## 488  99791.9854 7023596   Oceania 110.80060 93.64329 127.95791
## 489 100936.2477 7023596   Oceania 111.31495 94.14610 128.48380
## 490 102080.5100 7023596   Oceania 111.82930 94.64876 129.00985
## 491 103224.7723 7023596   Oceania 112.34366 95.15127 129.53605
## 492 104369.0346 7023596   Oceania 112.85801 95.65362 130.06240
## 493 105513.2968 7023596   Oceania 113.37236 96.15582 130.58890
## 494 106657.5591 7023596   Oceania 113.88671 96.65786 131.11556
## 495 107801.8214 7023596   Oceania 114.40107 97.15976 131.64237
## 496 108946.0837 7023596   Oceania 114.91542 97.66150 132.16933
## 497 110090.3460 7023596   Oceania 115.42977 98.16309 132.69645
## 498 111234.6083 7023596   Oceania 115.94412 98.66453 133.22371
## 499 112378.8706 7023596   Oceania 116.45848 99.16582 133.75113
## 500 113523.1329 7023596   Oceania 116.97283 99.66696 134.27869
# モデルの結果を表示する
# アフリカとヨーロッパだけ

p <- ggplot(data = subset(pred_df, continent %in% c("Africa", "Europe")),
            mapping = aes(x = gdpPercap, y = fit, 
                          ymin = lwr, ymax = upr,
                          color = continent, fill = continent,
                          group = continent))
p + geom_point(data = subset(gapminder, continent %in% c("Africa", "Europe")),
               mapping = aes(x = gdpPercap, y = lifeExp,
                             color = continent),
               alpha = 0.5, inherit.aes = FALSE) + 
    geom_line() + 
    geom_ribbon(alpha = 0.2, color = NA) + 
    scale_x_log10(labels = scales::dollar)

実際にpredict()関数を使うよりも便利なパッケージを使うことが多い. しかしながら,predict()a関数は様々なモデルに対して安全に動作するので, predict()関数を理解するのは重要.

6.5 brromパッケージによるtidyなモデルオブジェクトの取り扱い

broomパッケージ - Rが生成したモデルから作図に使える数値を抽出するための関数を集めたパッケージ - 主な用途はモデルオブジェクトをggplotで使いやすいデータフレームに変換すること 1. モデル自体の構成要素に関わる情報(回帰係数やt検定量など) 2. モデルと元データとの関係を表す観測ベースの情報(各観測値の近似値や残差) 3. モデルの当てはまりに関する情報(F統計量,モデルの逸脱度,決定係数など)

library(broom)

6.5.1 tidy()関数によるモデルの構成要素レベルの情報の抽出

# 線形モデルの作成
out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)
summary(out)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.161  -4.486   0.297   5.110  25.175 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.781e+01  3.395e-01 140.819  < 2e-16 ***
## gdpPercap         4.495e-04  2.346e-05  19.158  < 2e-16 ***
## pop               6.570e-09  1.975e-09   3.326 0.000901 ***
## continentAmericas 1.348e+01  6.000e-01  22.458  < 2e-16 ***
## continentAsia     8.193e+00  5.712e-01  14.342  < 2e-16 ***
## continentEurope   1.747e+01  6.246e-01  27.973  < 2e-16 ***
## continentOceania  1.808e+01  1.782e+00  10.146  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.365 on 1697 degrees of freedom
## Multiple R-squared:  0.5821, Adjusted R-squared:  0.5806 
## F-statistic: 393.9 on 6 and 1697 DF,  p-value: < 2.2e-16
# モデルの構成要素レベルの抽出
out_comp <- tidy(out)
out_comp
## # A tibble: 7 x 5
##   term              estimate     std.error statistic   p.value
##   <chr>                <dbl>         <dbl>     <dbl>     <dbl>
## 1 (Intercept)        4.78e+1 0.340            141.   0.       
## 2 gdpPercap          4.50e-4 0.0000235         19.2  3.24e- 74
## 3 pop                6.57e-9 0.00000000198      3.33 9.01e-  4
## 4 continentAmericas  1.35e+1 0.600             22.5  5.19e- 98
## 5 continentAsia      8.19e+0 0.571             14.3  4.06e- 44
## 6 continentEurope    1.75e+1 0.625             28.0  6.34e-142
## 7 continentOceania   1.81e+1 1.78              10.1  1.59e- 23
# round_df: socvizパッケージ
out_comp %>% as.data.frame() %>% round_df()
##                term estimate std.error statistic p.value
## 1       (Intercept)    47.81      0.34    140.82       0
## 2         gdpPercap     0.00      0.00     19.16       0
## 3               pop     0.00      0.00      3.33       0
## 4 continentAmericas    13.48      0.60     22.46       0
## 5     continentAsia     8.19      0.57     14.34       0
## 6   continentEurope    17.47      0.62     27.97       0
## 7  continentOceania    18.08      1.78     10.15       0
#線形回帰モデルの推定料の図示
p <- ggplot(data = out_comp,
            mapping = aes(x = term, y = estimate))
p + geom_point() + 
    coord_flip()

# 推定値の信頼区間を求める
out_conf <- tidy(out, conf.int = TRUE)
out_conf %>% as.data.frame() %>% round_df()
##                term estimate std.error statistic p.value conf.low conf.high
## 1       (Intercept)    47.81      0.34    140.82       0    47.15     48.48
## 2         gdpPercap     0.00      0.00     19.16       0     0.00      0.00
## 3               pop     0.00      0.00      3.33       0     0.00      0.00
## 4 continentAmericas    13.48      0.60     22.46       0    12.30     14.65
## 5     continentAsia     8.19      0.57     14.34       0     7.07      9.31
## 6   continentEurope    17.47      0.62     27.97       0    16.25     18.70
## 7  continentOceania    18.08      1.78     10.15       0    14.59     21.58
# 切片を削除する
out_conf <- subset(out_conf, term %nin% "(Intercept)")
# 大陸名を調整する
out_conf$nicelabs <- prefix_strip(out_conf$term, "continent")
out_conf
## # A tibble: 6 x 8
##   term      estimate  std.error statistic   p.value conf.low  conf.high nicelabs
##   <chr>        <dbl>      <dbl>     <dbl>     <dbl>    <dbl>      <dbl> <chr>   
## 1 gdpPercap  4.50e-4    2.35e-5     19.2  3.24e- 74  4.03e-4    4.96e-4 gdpPerc…
## 2 pop        6.57e-9    1.98e-9      3.33 9.01e-  4  2.70e-9    1.04e-8 Pop     
## 3 continen…  1.35e+1    6.00e-1     22.5  5.19e- 98  1.23e+1    1.47e+1 Americas
## 4 continen…  8.19e+0    5.71e-1     14.3  4.06e- 44  7.07e+0    9.31e+0 Asia    
## 5 continen…  1.75e+1    6.25e-1     28.0  6.34e-142  1.62e+1    1.87e+1 Europe  
## 6 continen…  1.81e+1    1.78e+0     10.1  1.59e- 23  1.46e+1    2.16e+1 Oceania
p <- ggplot(data = out_conf,
            mapping = aes(x = reorder(nicelabs, estimate),
                          y = estimate,
                          ymin = conf.low, ymax = conf.high))
p + geom_pointrange() + 
    coord_flip()

6.5.2 augment()関数による観測要素レベルのモデル情報の抽出

augment()関数: 元データの観測レベルから計算される統計量.変数名の先頭には.がついている

out_aug <- augment(out)
head(out_aug) %>% as.data.frame() %>% round_df()
##   lifeExp gdpPercap      pop continent .fitted .resid .hat .sigma .cooksd
## 1   28.80    779.45  8425333      Asia   56.41 -27.61    0   8.34    0.01
## 2   30.33    820.85  9240934      Asia   56.44 -26.10    0   8.34    0.00
## 3   32.00    853.10 10267083      Asia   56.46 -24.46    0   8.35    0.00
## 4   34.02    836.20 11537966      Asia   56.46 -22.44    0   8.35    0.00
## 5   36.09    739.98 13079460      Asia   56.43 -20.34    0   8.35    0.00
## 6   38.44    786.11 14880372      Asia   56.46 -18.02    0   8.36    0.00
##   .std.resid
## 1      -3.31
## 2      -3.13
## 3      -2.93
## 4      -2.69
## 5      -2.44
## 6      -2.16
# augment()関数の引数にdataを指定すると,変数全てを追加できる
out_aug <- augment(out, data = gapminder)
head(out_aug) %>% as.data.frame() %>% round_df()
##       country continent year lifeExp      pop gdpPercap .fitted .resid .hat
## 1 Afghanistan      Asia 1952   28.80  8425333    779.45   56.41 -27.61    0
## 2 Afghanistan      Asia 1957   30.33  9240934    820.85   56.44 -26.10    0
## 3 Afghanistan      Asia 1962   32.00 10267083    853.10   56.46 -24.46    0
## 4 Afghanistan      Asia 1967   34.02 11537966    836.20   56.46 -22.44    0
## 5 Afghanistan      Asia 1972   36.09 13079460    739.98   56.43 -20.34    0
## 6 Afghanistan      Asia 1977   38.44 14880372    786.11   56.46 -18.02    0
##   .sigma .cooksd .std.resid
## 1   8.34    0.01      -3.31
## 2   8.34    0.00      -3.13
## 3   8.35    0.00      -2.93
## 4   8.35    0.00      -2.69
## 5   8.35    0.00      -2.44
## 6   8.36    0.00      -2.16
# 予測値 vs 残差プロット
p <- ggplot(data = out_aug,
            mapping = aes(x = .fitted, y = .resid))
p + geom_point()

6.5.3 glance()関数によるモデルレベルの情報の抽出

glance()関数: モデルオブジェクトにsummary()関数を適用した時の結果を整理するための関数. 真の力はデータをグループ化したり,モデルの一部を抽出してスケールアップができたりする点にある

glance(out)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1     0.582         0.581  8.37      394. 3.94e-317     6 -6034. 12084. 12127.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

しかし,broomパッケージのtidy(),augment(),glance()関数は全てのクラスのモデルに対して,全ての機能を使えるわけではない

library(survival)
head(lung)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
?lung
# Surv()関数を使ってCox比例ハザードモデルの応答変数,アウトカムの変数を作成し,
# 次のcpxph()関数で予測値を算出する
out_cph <- coxph(Surv(time, status) ~ age + sex,
                 data = lung)

# survfit()関数をつかって,モデルから生存曲線を作成する
out_surv <- survfit(out_cph)
summary(out_surv)
## Call: survfit(formula = out_cph)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5    228       1   0.9958 0.00417       0.9877        1.000
##    11    227       3   0.9833 0.00831       0.9671        1.000
##    12    224       1   0.9791 0.00928       0.9611        0.997
##    13    223       2   0.9706 0.01096       0.9494        0.992
##    15    221       1   0.9664 0.01170       0.9438        0.990
##    26    220       1   0.9622 0.01240       0.9382        0.987
##    30    219       1   0.9579 0.01305       0.9327        0.984
##    31    218       1   0.9537 0.01368       0.9273        0.981
##    53    217       2   0.9452 0.01484       0.9165        0.975
##    54    215       1   0.9409 0.01538       0.9112        0.972
##    59    214       1   0.9366 0.01590       0.9060        0.968
##    60    213       2   0.9280 0.01689       0.8955        0.962
##    61    211       1   0.9237 0.01735       0.8903        0.958
##    62    210       1   0.9194 0.01780       0.8852        0.955
##    65    209       2   0.9109 0.01865       0.8751        0.948
##    71    207       1   0.9066 0.01906       0.8700        0.945
##    79    206       1   0.9023 0.01945       0.8650        0.941
##    81    205       2   0.8937 0.02020       0.8550        0.934
##    88    203       2   0.8851 0.02091       0.8451        0.927
##    92    201       1   0.8809 0.02126       0.8402        0.924
##    93    199       1   0.8766 0.02159       0.8352        0.920
##    95    198       2   0.8679 0.02224       0.8254        0.913
##   105    196       1   0.8636 0.02256       0.8205        0.909
##   107    194       2   0.8549 0.02317       0.8107        0.902
##   110    192       1   0.8506 0.02346       0.8058        0.898
##   116    191       1   0.8462 0.02375       0.8009        0.894
##   118    190       1   0.8419 0.02403       0.7961        0.890
##   122    189       1   0.8375 0.02431       0.7912        0.887
##   131    188       1   0.8332 0.02458       0.7863        0.883
##   132    187       2   0.8244 0.02510       0.7767        0.875
##   135    185       1   0.8201 0.02535       0.7719        0.871
##   142    184       1   0.8157 0.02560       0.7671        0.867
##   144    183       1   0.8113 0.02584       0.7622        0.864
##   145    182       2   0.8026 0.02631       0.7527        0.856
##   147    180       1   0.7982 0.02654       0.7479        0.852
##   153    179       1   0.7939 0.02676       0.7431        0.848
##   156    178       2   0.7852 0.02719       0.7336        0.840
##   163    176       3   0.7720 0.02780       0.7194        0.828
##   166    173       2   0.7632 0.02819       0.7099        0.821
##   167    171       1   0.7588 0.02837       0.7052        0.817
##   170    170       1   0.7545 0.02856       0.7005        0.813
##   175    167       1   0.7500 0.02874       0.6958        0.809
##   176    165       1   0.7456 0.02892       0.6910        0.804
##   177    164       1   0.7411 0.02910       0.6862        0.800
##   179    162       2   0.7321 0.02946       0.6766        0.792
##   180    160       1   0.7276 0.02963       0.6717        0.788
##   181    159       2   0.7185 0.02996       0.6621        0.780
##   182    157       1   0.7140 0.03012       0.6573        0.776
##   183    156       1   0.7095 0.03028       0.6525        0.771
##   186    154       1   0.7049 0.03044       0.6477        0.767
##   189    152       1   0.7003 0.03059       0.6428        0.763
##   194    149       1   0.6957 0.03075       0.6379        0.759
##   197    147       1   0.6910 0.03091       0.6330        0.754
##   199    145       1   0.6863 0.03106       0.6280        0.750
##   201    144       2   0.6769 0.03136       0.6181        0.741
##   202    142       1   0.6722 0.03150       0.6132        0.737
##   207    139       1   0.6674 0.03165       0.6082        0.732
##   208    138       1   0.6627 0.03179       0.6032        0.728
##   210    137       1   0.6579 0.03192       0.5982        0.724
##   212    135       1   0.6531 0.03206       0.5932        0.719
##   218    134       1   0.6483 0.03219       0.5882        0.715
##   222    132       1   0.6435 0.03232       0.5832        0.710
##   223    130       1   0.6386 0.03246       0.5781        0.706
##   226    126       1   0.6336 0.03260       0.5728        0.701
##   229    125       1   0.6286 0.03274       0.5676        0.696
##   230    124       1   0.6236 0.03287       0.5624        0.691
##   239    121       2   0.6133 0.03314       0.5517        0.682
##   245    117       1   0.6081 0.03327       0.5463        0.677
##   246    116       1   0.6030 0.03340       0.5409        0.672
##   267    112       1   0.5977 0.03354       0.5354        0.667
##   268    111       1   0.5924 0.03367       0.5299        0.662
##   269    110       1   0.5871 0.03379       0.5244        0.657
##   270    108       1   0.5817 0.03392       0.5189        0.652
##   283    104       1   0.5762 0.03405       0.5132        0.647
##   284    103       1   0.5707 0.03419       0.5075        0.642
##   285    101       2   0.5595 0.03444       0.4959        0.631
##   286     99       1   0.5538 0.03456       0.4901        0.626
##   288     98       1   0.5482 0.03468       0.4843        0.621
##   291     97       1   0.5426 0.03479       0.4785        0.615
##   293     94       1   0.5368 0.03490       0.4726        0.610
##   301     91       1   0.5310 0.03502       0.4666        0.604
##   303     89       1   0.5250 0.03514       0.4604        0.599
##   305     87       1   0.5189 0.03526       0.4542        0.593
##   306     86       1   0.5129 0.03537       0.4480        0.587
##   310     85       2   0.5007 0.03558       0.4356        0.576
##   320     82       1   0.4946 0.03568       0.4294        0.570
##   329     81       1   0.4884 0.03577       0.4231        0.564
##   337     79       1   0.4822 0.03586       0.4168        0.558
##   340     78       1   0.4760 0.03594       0.4105        0.552
##   345     77       1   0.4698 0.03601       0.4043        0.546
##   348     76       1   0.4636 0.03607       0.3981        0.540
##   350     75       1   0.4575 0.03612       0.3919        0.534
##   351     74       1   0.4514 0.03617       0.3858        0.528
##   353     73       2   0.4391 0.03623       0.3735        0.516
##   361     70       1   0.4329 0.03626       0.3674        0.510
##   363     69       2   0.4205 0.03629       0.3551        0.498
##   364     67       1   0.4143 0.03629       0.3489        0.492
##   371     65       2   0.4017 0.03629       0.3365        0.479
##   387     60       1   0.3952 0.03629       0.3301        0.473
##   390     59       1   0.3887 0.03629       0.3237        0.467
##   394     58       1   0.3822 0.03627       0.3173        0.460
##   426     55       1   0.3753 0.03628       0.3106        0.454
##   428     54       1   0.3685 0.03627       0.3038        0.447
##   429     53       1   0.3616 0.03625       0.2971        0.440
##   433     52       1   0.3547 0.03622       0.2904        0.433
##   442     51       1   0.3478 0.03618       0.2837        0.426
##   444     50       1   0.3409 0.03612       0.2770        0.420
##   450     48       1   0.3338 0.03607       0.2701        0.413
##   455     47       1   0.3267 0.03600       0.2633        0.405
##   457     46       1   0.3196 0.03592       0.2564        0.398
##   460     44       1   0.3123 0.03584       0.2494        0.391
##   473     43       1   0.3049 0.03575       0.2423        0.384
##   477     42       1   0.2976 0.03564       0.2353        0.376
##   519     39       1   0.2899 0.03554       0.2280        0.369
##   520     38       1   0.2822 0.03543       0.2206        0.361
##   524     37       2   0.2668 0.03514       0.2061        0.345
##   533     34       1   0.2590 0.03498       0.1987        0.337
##   550     32       1   0.2510 0.03481       0.1913        0.329
##   558     30       1   0.2428 0.03463       0.1836        0.321
##   567     28       1   0.2344 0.03445       0.1757        0.313
##   574     27       1   0.2259 0.03425       0.1678        0.304
##   583     26       1   0.2173 0.03401       0.1599        0.295
##   613     24       1   0.2083 0.03378       0.1516        0.286
##   624     23       1   0.1992 0.03351       0.1433        0.277
##   641     22       1   0.1901 0.03320       0.1350        0.268
##   643     21       1   0.1811 0.03283       0.1269        0.258
##   654     20       1   0.1719 0.03243       0.1187        0.249
##   655     19       1   0.1627 0.03196       0.1107        0.239
##   687     18       1   0.1535 0.03145       0.1027        0.229
##   689     17       1   0.1444 0.03088       0.0950        0.220
##   705     16       1   0.1352 0.03025       0.0872        0.210
##   707     15       1   0.1263 0.02954       0.0799        0.200
##   728     14       1   0.1172 0.02878       0.0725        0.190
##   731     13       1   0.1083 0.02794       0.0653        0.180
##   735     12       1   0.0995 0.02703       0.0584        0.169
##   765     10       1   0.0904 0.02606       0.0514        0.159
##   791      9       1   0.0817 0.02499       0.0449        0.149
##   814      7       1   0.0719 0.02387       0.0375        0.138
##   883      4       1   0.0577 0.02303       0.0264        0.126
# 予測した生存曲線を作図する
out_tidy <- tidy(out_surv)
out_tidy
## # A tibble: 186 x 8
##     time n.risk n.event n.censor estimate std.error conf.high conf.low
##    <dbl>  <dbl>   <dbl>    <dbl>    <dbl>     <dbl>     <dbl>    <dbl>
##  1     5    228       1        0    0.996   0.00419     1        0.988
##  2    11    227       3        0    0.983   0.00845     1.00     0.967
##  3    12    224       1        0    0.979   0.00947     0.997    0.961
##  4    13    223       2        0    0.971   0.0113      0.992    0.949
##  5    15    221       1        0    0.966   0.0121      0.990    0.944
##  6    26    220       1        0    0.962   0.0129      0.987    0.938
##  7    30    219       1        0    0.958   0.0136      0.984    0.933
##  8    31    218       1        0    0.954   0.0143      0.981    0.927
##  9    53    217       2        0    0.945   0.0157      0.975    0.917
## 10    54    215       1        0    0.941   0.0163      0.972    0.911
## # … with 176 more rows
p <- ggplot(data = out_tidy,
            mapping = aes(x = time,
                          y = estimate))
p + geom_line() + 
    geom_ribbon(mapping = aes(ymin = conf.low,
                              ymax = conf.high),
                alpha = 0.2) + 
    labs(title = "Kaplan-Meier method")

備考

カプランマイヤー法
イベントが発生するまでの時間(生存時間)分析する生存時間分析の手法. 期間内にイベントが起きなかった例を「打ち切り」として分析に含めることが出来る.

手術から再発までの時間を分析する際に,途中で転院したり,治療を中断したケースは,少なくとも再発していないためイベントに該当しないが,欠損値として分析から除外するとバイアスがかかる.打ち切り例は,イベントが起きる(打ち切りになる)までは生存率の計算に寄与し,打ち切り後はケースから除外される

6.6 グループ化したデータの分析およびリスト列の取り扱い

broomパッケージを使うと,データのサブセットにモデルを適用し, それぞれのサブセットごとに当てはめたモデルの結果をまとめた表を作ることが可能

# gapminderデータセットから1977年のEuropeのデータを抽出し
# モデルを作成する
eu77 <- gapminder %>% filter(continent == "Europe", year == 1977)
fit <- lm(formula = lifeExp ~ log(gdpPercap),
          data = eu77)
summary(fit)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = eu77)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4956 -1.0306  0.0935  1.1755  3.7125 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      29.489      7.161   4.118 0.000306 ***
## log(gdpPercap)    4.488      0.756   5.936 2.17e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.114 on 28 degrees of freedom
## Multiple R-squared:  0.5572, Adjusted R-squared:  0.5414 
## F-statistic: 35.24 on 1 and 28 DF,  p-value: 2.173e-06
fit_comp <- tidy(fit, conf.int = TRUE)
fit_comp
## # A tibble: 2 x 7
##   term           estimate std.error statistic    p.value conf.low conf.high
##   <chr>             <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 (Intercept)       29.5      7.16       4.12 0.000306      14.8      44.2 
## 2 log(gdpPercap)     4.49     0.756      5.94 0.00000217     2.94      6.04
# 推定値の信頼区間を求める
p <- ggplot(data = fit_comp,
            mapping = aes(x = term,
                          y = estimate,
                          ymin = conf.low,
                          ymax = conf.high))
p + geom_pointrange() + 
    coord_flip()

dplyrとbroomを使うことで大陸ー年ごとに層別化されたデータを コンパクトかつtidyな方法で処理・解析できる

#nest: tidyrパッケージの関数

out_le <- gapminder %>%
  group_by(continent, year) %>%
  nest() # .key で名前を変更できる
head(out_le)
## # A tibble: 6 x 3
## # Groups:   continent, year [6]
##   continent  year data             
##   <fct>     <int> <list>           
## 1 Asia       1952 <tibble [33 × 4]>
## 2 Asia       1957 <tibble [33 × 4]>
## 3 Asia       1962 <tibble [33 × 4]>
## 4 Asia       1967 <tibble [33 × 4]>
## 5 Asia       1972 <tibble [33 × 4]>
## 6 Asia       1977 <tibble [33 × 4]>
?nest
# nest()関数によって,
# 表形式のまま複雑なオブジェクト(リスト)を保存することができる
# unnestを使って対象のデータを取り出すことが可能
out_le %>% filter(continent == "Europe" & year == 1977)%>% 
  unnest(cols = c(data))
## # A tibble: 30 x 6
## # Groups:   continent, year [1]
##    continent  year country                lifeExp      pop gdpPercap
##    <fct>     <int> <fct>                    <dbl>    <int>     <dbl>
##  1 Europe     1977 Albania                   68.9  2509048     3533.
##  2 Europe     1977 Austria                   72.2  7568430    19749.
##  3 Europe     1977 Belgium                   72.8  9821800    19118.
##  4 Europe     1977 Bosnia and Herzegovina    69.9  4086000     3528.
##  5 Europe     1977 Bulgaria                  70.8  8797022     7612.
##  6 Europe     1977 Croatia                   70.6  4318673    11305.
##  7 Europe     1977 Czech Republic            70.7 10161915    14800.
##  8 Europe     1977 Denmark                   74.7  5088419    20423.
##  9 Europe     1977 Finland                   72.5  4738902    15605.
## 10 Europe     1977 France                    73.8 53165019    18293.
## # … with 20 more rows
# リスト列は,
# 1. 列内のオブジェクトに対してまとめて簡潔かつtidyな操作ができる

# まずfit_ols関数を作成し,対象のデータフレームに対して線形モデルを実行することが可能
fit_ols <- function(df){
  lm(formula = lifeExp ~ log(gdpPercap),
     data = df)
}
# fit_ols関数を,リスト列を含むそれぞれの行に順番にmapする
out_le <- gapminder %>%
  group_by(continent, year) %>%
  nest() %>%
  mutate(model = map(data, fit_ols))

out_le %>% filter(continent == "Asia" & year == 1977) %>%
  unnest(cols = c(data))
## # A tibble: 33 x 7
## # Groups:   continent, year [1]
##    continent  year country          lifeExp       pop gdpPercap model 
##    <fct>     <int> <fct>              <dbl>     <int>     <dbl> <list>
##  1 Asia       1977 Afghanistan         38.4  14880372      786. <lm>  
##  2 Asia       1977 Bahrain             65.6    297410    19340. <lm>  
##  3 Asia       1977 Bangladesh          46.9  80428306      660. <lm>  
##  4 Asia       1977 Cambodia            31.2   6978607      525. <lm>  
##  5 Asia       1977 China               64.0 943455000      741. <lm>  
##  6 Asia       1977 Hong Kong, China    73.6   4583700    11186. <lm>  
##  7 Asia       1977 India               54.2 634000000      813. <lm>  
##  8 Asia       1977 Indonesia           52.7 136725000     1383. <lm>  
##  9 Asia       1977 Iran                57.7  35480679    11889. <lm>  
## 10 Asia       1977 Iraq                60.4  11882916    14688. <lm>  
## # … with 23 more rows
# 初めから一回で行う
# やることをまとめると
# 1. (大陸+年代)別に線形モデル作成する
# 2. それぞれのモデルから要約統計量を抽出する
# 3. この結果のネストを解除して,切片項とオセアニアデータを削除する(前者は便宜上,後者は国が少ないから)
fit_ols <- function(.df){
  lm(formula = lifeExp ~ log(gdpPercap),
     data = .df)
}
out_tidy <- gapminder %>%
  group_by(continent, year) %>%
  nest() %>%
  mutate(model = map(data, fit_ols),
         tidied = map(model, tidy)) %>%
  unnest(tidied) %>% 
  select(!c(data, model)) %>% # data列とmodel列はいらない
  filter(term %nin% "(Intercept)",
         continent %nin% "Oceania")
out_tidy %>% as.data.frame() %>%
  round_df() %>% slice_sample(n = 5, replace = TRUE)
##   continent year           term estimate std.error statistic p.value
## 1  Americas 2002 log(gdpPercap)     5.05      0.84      5.99       0
## 2    Europe 2002 log(gdpPercap)     3.74      0.45      8.40       0
## 3    Africa 1987 log(gdpPercap)     6.48      0.90      7.22       0
## 4    Europe 2002 log(gdpPercap)     3.74      0.45      8.40       0
## 5      Asia 1952 log(gdpPercap)     4.16      1.25      3.33       0
# 以上のコードにより,
# 大陸内で各年ごとに一人当たりの対数変換されたGDPと平均寿命との関係について解析した回帰分析の結果がtidyな形で得られた

# 得られたモデルの推定値はそれらのグループを確認し,図示する為に利用する

# 大陸ごとに層別した年区切りのGDPと平均寿命の関係における推定値
p <- ggplot(data = out_tidy,
            mapping = aes(x = year, y = estimate,
                          ymin = estimate - 2*std.error,
                          ymax = estimate + 2*std.error,
                          group = continent, color = continent))
p + geom_pointrange(position = position_dodge(width = 1)) + # すこしずらす
    scale_x_continuous(breaks = unique(gapminder$year)) + # 調査年に合わせる
    theme(legend.position = "top") + 
    labs(x = "Year", y = "Estimate", color = "Continent")

6.7 限界効果の可視化

モデルの偏効果・限界効果を推定して図示することが, モデルを正確に解釈し,有用な予測を示すための一般的な方法

限界効果: 説明変数が一単位ふえたときの目的変数の増加量

偏効果: 他の独立した説明変数の値が固定されている時に,説明変数が一単位増えた場合の目的変数の増加量

# 限界効果を可視化するパッケージ
library(margins)

これから行うこと 扱うデータセット - gss_sm: アメリカ合衆国の一般的な社会調査データ 目的変数 - obama: オバマに投票した場合に1, それ以外は0 説明変数 - age(年齢): 離散値 - polviews(リベラル・保守の傾向): Extremely Conservative ~ Extremely Liberal. Moderateが参照カテゴリ - race(人種): White, Black, Other. Whiteが参照カテゴリ - sex(性別): Male, Female. Maleが参照カテゴリ ※参照カテゴリ: 比較の基準となる変数. 方法 - ロジスティック回帰.人種と性別の間に交互作用があるとする

# まずpolviewsの参照カテゴリを変更する
# relevel()関数で可能
gss_sm$polviews_m <- relevel(gss_sm$polviews, ref = "Moderate")

out_bo <- glm(formula = obama ~ polviews_m + sex*race,
              family = "binomial", 
              data = gss_sm)
summary(out_bo)
## 
## Call:
## glm(formula = obama ~ polviews_m + sex * race, family = "binomial", 
##     data = gss_sm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9045  -0.5541   0.1772   0.5418   2.2437  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       0.296493   0.134091   2.211  0.02703 *  
## polviews_mExtremely Liberal       2.372950   0.525045   4.520 6.20e-06 ***
## polviews_mLiberal                 2.600031   0.356666   7.290 3.10e-13 ***
## polviews_mSlightly Liberal        1.293172   0.248435   5.205 1.94e-07 ***
## polviews_mSlightly Conservative  -1.355277   0.181291  -7.476 7.68e-14 ***
## polviews_mConservative           -2.347463   0.200384 -11.715  < 2e-16 ***
## polviews_mExtremely Conservative -2.727384   0.387210  -7.044 1.87e-12 ***
## sexFemale                         0.254866   0.145370   1.753  0.07956 .  
## raceBlack                         3.849526   0.501319   7.679 1.61e-14 ***
## raceOther                        -0.002143   0.435763  -0.005  0.99608    
## sexFemale:raceBlack              -0.197506   0.660066  -0.299  0.76477    
## sexFemale:raceOther               1.574829   0.587657   2.680  0.00737 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2247.9  on 1697  degrees of freedom
## Residual deviance: 1345.9  on 1686  degrees of freedom
##   (1169 observations deleted due to missingness)
## AIC: 1369.9
## 
## Number of Fisher Scoring iterations: 6
# margins()関数を使ってそれぞれの変数の限界効果を計算する
bo_m <- margins(out_bo)
summary(bo_m)
##                            factor     AME     SE        z      p   lower
##            polviews_mConservative -0.4119 0.0283 -14.5394 0.0000 -0.4674
##  polviews_mExtremely Conservative -0.4538 0.0420 -10.7971 0.0000 -0.5361
##       polviews_mExtremely Liberal  0.2681 0.0295   9.0996 0.0000  0.2103
##                 polviews_mLiberal  0.2768 0.0229  12.0736 0.0000  0.2319
##   polviews_mSlightly Conservative -0.2658 0.0330  -8.0596 0.0000 -0.3304
##        polviews_mSlightly Liberal  0.1933 0.0303   6.3896 0.0000  0.1340
##                         raceBlack  0.4032 0.0173  23.3568 0.0000  0.3694
##                         raceOther  0.1247 0.0386   3.2297 0.0012  0.0490
##                         sexFemale  0.0443 0.0177   2.5073 0.0122  0.0097
##    upper
##  -0.3564
##  -0.3714
##   0.3258
##   0.3218
##  -0.2011
##   0.2526
##   0.4371
##   0.2005
##   0.0789
# marginsパッケージに独自の可視化メソッドがある
# ここではggplot2を用いて可視化する

# まずtibbleに変換する
bo_gg <- as_tibble(summary(bo_m))
bo_gg
## # A tibble: 9 x 7
##   factor                            AME     SE      z         p    lower   upper
##   <chr>                           <dbl>  <dbl>  <dbl>     <dbl>    <dbl>   <dbl>
## 1 polviews_mConservative        -0.412  0.0283 -14.5  6.82e- 48 -0.467   -0.356 
## 2 polviews_mExtremely Conserva… -0.454  0.0420 -10.8  3.55e- 27 -0.536   -0.371 
## 3 polviews_mExtremely Liberal    0.268  0.0295   9.10 9.07e- 20  0.210    0.326 
## 4 polviews_mLiberal              0.277  0.0229  12.1  1.46e- 33  0.232    0.322 
## 5 polviews_mSlightly Conservat… -0.266  0.0330  -8.06 7.65e- 16 -0.330   -0.201 
## 6 polviews_mSlightly Liberal     0.193  0.0303   6.39 1.66e- 10  0.134    0.253 
## 7 raceBlack                      0.403  0.0173  23.4  1.18e-120  0.369    0.437 
## 8 raceOther                      0.125  0.0386   3.23 1.24e-  3  0.0490   0.200 
## 9 sexFemale                      0.0443 0.0177   2.51 1.22e-  2  0.00967  0.0789
# factorのラベルを編集する
prefixes <- c("polviews_m", "sex")
bo_gg$factor <- prefix_strip(bo_gg$factor, prefixes)
bo_gg$factor <- prefix_replace(bo_gg$factor, "race", "Race: ")

bo_gg %>% select(factor, AME, lower, upper)
## # A tibble: 9 x 4
##   factor                     AME    lower   upper
##   <chr>                    <dbl>    <dbl>   <dbl>
## 1 Conservative           -0.412  -0.467   -0.356 
## 2 Extremely Conservative -0.454  -0.536   -0.371 
## 3 Extremely Liberal       0.268   0.210    0.326 
## 4 Liberal                 0.277   0.232    0.322 
## 5 Slightly Conservative  -0.266  -0.330   -0.201 
## 6 Slightly Liberal        0.193   0.134    0.253 
## 7 Race: Black             0.403   0.369    0.437 
## 8 Race: Other             0.125   0.0490   0.200 
## 9 Female                  0.0443  0.00967  0.0789
# 作図
# 平均限界効果の可視化
p <- ggplot(data = bo_gg,
            mapping = aes(x = reorder(factor, AME),
                          y = AME,
                          ymin = lower, ymax = upper))
p + geom_hline(yintercept = 0, color = "gray80") + 
    geom_pointrange() + 
    coord_flip() + 
    labs(x = NULL, y = "Average Marginal Effect")

# 変数の条件付き効果を作図する場合
# 条件付き効果ってなんだ?
pv_cp <- cplot(out_bo, x = "sex", draw = FALSE)
pv_cp
##    xvals     yvals     upper     lower
## 1   Male 0.5735849 0.6378653 0.5093045
## 2 Female 0.6344507 0.6887845 0.5801169
p <- ggplot(data = pv_cp,
            mapping = aes(x = reorder(xvals, yvals), 
                          y = yvals,
                          ymin = lower, ymax = upper))
p + geom_hline(yintercept = 0, color = "gray80") + 
    geom_pointrange() + 
    coord_flip() + 
    labs(x = NULL, y = "Conditional Effect")

6.8 複雑な調査データの可視化

社会科学では,複雑な調査デザインの元,収集されたデータを解析する. 層別化,反復重み付け(対照グループと比較するため),クラスター構造をもったデータを扱うことも多い. これを扱う方法として,Thomas Lumleyが開発したsurveyパッケージおよびGerg Freedman Ellisが開発したsrvyrパッケージがある. srvyrパッケージはsurveyパッケージをtidyverseの文法(パイプライン)で書けるようにしたもの.

扱うデータ: gss_lon.
1972年にGSS(General Social Survey)が始まって以来のGSSの様々な調査値の変動に関するサブセットが含まれている 行うこと: 1976~2016年までのそれぞれの調査年における,人種ごとの重み付き教育歴の分布を推定する

# パッケージの読み込み
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(srvyr)
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
# データの確認
glimpse(gss_lon)
## Rows: 62,466
## Columns: 25
## $ year        <dbl> 1972, 1972, 1972, 1972, 1972, 1972, 1972, 1972, 1972, 19…
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ ballot      <labelled> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ age         <labelled> 23, 70, 48, 27, 61, 26, 28, 27, 21, 30, 30, 56, 54,…
## $ degree      <fct> Bachelor, Lt High School, High School, Bachelor, High Sc…
## $ race        <fct> White, White, White, White, White, White, White, White, …
## $ sex         <fct> Female, Male, Female, Female, Female, Male, Male, Male, …
## $ siblings    <fct> 3, 4, 5, 5, 2, 1, 6+, 1, 2, 6+, 6+, 6+, 2, 2, 0, 6+, 0, …
## $ kids        <fct> 0, 4+, 4+, 0, 2, 0, 2, 0, 2, 4+, 1, 4+, 1, 2, 4+, 2, 2, …
## $ bigregion   <fct> Midwest, Midwest, Midwest, Midwest, Midwest, Midwest, Mi…
## $ income16    <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ religion    <fct> Jewish, Catholic, Protestant, Other, Protestant, Protest…
## $ marital     <fct> Never Married, Married, Married, Married, Married, Never…
## $ padeg       <fct> Lt High School, Lt High School, Lt High School, Bachelor…
## $ madeg       <fct> NA, Lt High School, Lt High School, High School, Lt High…
## $ partyid     <fct> "Ind,near Dem", "Not Str Democrat", "Independent", "Not …
## $ polviews    <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ happy       <fct> Not Too Happy, Not Too Happy, Pretty Happy, Not Too Happ…
## $ partners_rc <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ grass       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ zodiac      <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ pres12      <labelled> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ wtssall     <dbl> 0.4446, 0.8893, 0.8893, 0.8893, 0.8893, 0.4446, 0.4446, …
## $ vpsu        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ vstrat      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
options(survey.lonely.psu = "adjust")
options(na.action = "na.pass")


gss_wt <- subset(gss_lon, year > 1974) %>%
  mutate(stratvar = interaction(year, vstrat)) %>%
  as_survey_design(ids = vpsu,
                   strata = stratvar,
                   weights = wtssall,
                   nest = TRUE)

# stratvar列: 階層構造の情報である年ごとのサンプリング層の情報
#   interaction()関数を使って,yearとvstrat変数を組み合わせた,それぞれの年についての階層情報ベクトル
# 出力として,「1976.7001」のようなyear.vstratの形になる
# as_survey_design()関数を使って,調査デザインに関する情報を追加する
# サンプリングID,層(strata),重み付け(weight)に関する情報を追加する


# survey_mean()関数を使って,1976~2016年それぞれの年における人種ごとの教育歴の分布を算出する
out_grp <- gss_wt %>% 
  filter(year %in% seq(1976, 2016, by = 4)) %>% 
  group_by(year, race, degree) %>% 
  summarize(prop = survey_mean(na.rm = TRUE))
out_grp
## # A tibble: 162 x 5
## # Groups:   year, race [30]
##     year race  degree            prop prop_se
##    <dbl> <fct> <fct>            <dbl>   <dbl>
##  1  1976 White Lt High School 0.327   0.0160 
##  2  1976 White High School    0.517   0.0161 
##  3  1976 White Junior College 0.0128  0.00298
##  4  1976 White Bachelor       0.101   0.00955
##  5  1976 White Graduate       0.0392  0.00642
##  6  1976 White <NA>           0.00285 0.00151
##  7  1976 Black Lt High School 0.558   0.0603 
##  8  1976 Black High School    0.335   0.0476 
##  9  1976 Black Junior College 0.0423  0.0192 
## 10  1976 Black Bachelor       0.0577  0.0238 
## # … with 152 more rows
# 確認
out_grp %>% group_by(year, race) %>% 
  summarize(sum = sum(prop))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## # A tibble: 30 x 3
## # Groups:   year [10]
##     year race    sum
##    <dbl> <fct> <dbl>
##  1  1976 White  1   
##  2  1976 Black  1   
##  3  1976 Other  1.00
##  4  1980 White  1   
##  5  1980 Black  1   
##  6  1980 Other  1   
##  7  1984 White  1   
##  8  1984 Black  1   
##  9  1984 Other  1   
## 10  1988 White  1   
## # … with 20 more rows
# 度数の比率は各年の人種ごとに合計されて1になる

# 各年の人種と教育歴の全ての組み合わせの合計で1にしたい場合は
# interaction()関数を使って交互作用変数を考えると良い
out_mrg <- gss_wt %>% 
  filter(year %in% seq(1976, 2016, by = 4)) %>% 
  mutate(racedeg = interaction(race, degree)) %>% 
  group_by(year, racedeg) %>% 
  summarize(prop = survey_mean(na.rm = TRUE))
out_mrg
## # A tibble: 155 x 4
## # Groups:   year [10]
##     year racedeg                 prop prop_se
##    <dbl> <fct>                  <dbl>   <dbl>
##  1  1976 White.Lt High School 0.297   0.0146 
##  2  1976 Black.Lt High School 0.0470  0.00837
##  3  1976 Other.Lt High School 0.00194 0.00138
##  4  1976 White.High School    0.469   0.0159 
##  5  1976 Black.High School    0.0282  0.00593
##  6  1976 Other.High School    0.00324 0.00166
##  7  1976 White.Junior College 0.0117  0.00268
##  8  1976 Black.Junior College 0.00356 0.00162
##  9  1976 White.Bachelor       0.0916  0.00883
## 10  1976 Black.Bachelor       0.00486 0.00213
## # … with 145 more rows
# 確認
out_mrg %>% group_by(year) %>% 
  summarise(total = sum(prop))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
##     year total
##    <dbl> <dbl>
##  1  1976  1   
##  2  1980  1   
##  3  1984  1   
##  4  1988  1.  
##  5  1996  1   
##  6  2000  1.00
##  7  2004  1   
##  8  2008  1.00
##  9  2012  1   
## 10  2016  1.00
# race.degreeのような形で変数を扱いたくない場合も
# raceとdegreeをそれぞれ別の列で扱いたい
# separate()関数を使うことで対象列の名前を二つの列に分割できる
out_mrg <- gss_wt %>% 
  filter(year %in% seq(1976, 2016, by = 4)) %>% 
  mutate(racedeg = interaction(race, degree)) %>% 
  group_by(year, racedeg) %>% 
  summarize(prop = survey_mean(na.rm = TRUE)) %>% 
  separate(racedeg, sep = "\\.", into = c("race", "degree"))
out_mrg
## # A tibble: 155 x 5
## # Groups:   year [10]
##     year race  degree            prop prop_se
##    <dbl> <chr> <chr>            <dbl>   <dbl>
##  1  1976 White Lt High School 0.297   0.0146 
##  2  1976 Black Lt High School 0.0470  0.00837
##  3  1976 Other Lt High School 0.00194 0.00138
##  4  1976 White High School    0.469   0.0159 
##  5  1976 Black High School    0.0282  0.00593
##  6  1976 Other High School    0.00324 0.00166
##  7  1976 White Junior College 0.0117  0.00268
##  8  1976 Black Junior College 0.00356 0.00162
##  9  1976 White Bachelor       0.0916  0.00883
## 10  1976 Black Bachelor       0.00486 0.00213
## # … with 145 more rows

年別に層別化した場合,どのグラフを使うと良いのかは専門家でも意見が分かれる 単年度なら棒グラフでも良いが,長期間ののデータなら折れ線グラフを使うのも良い

# GSSの結果をダイナマイトプロット(棒グラフ±SE)で図示する

p <- ggplot(data = subset(out_grp, race %nin% "Other"),
            mapping = aes(x = degree,
                          y = prop,
                          ymin = prop - 2*prop_se,
                          ymax = prop + 2*prop_se,
                          fill = race,
                          color = race,
                          group = race))
dodge <- position_dodge(width = 0.9)
p + geom_col(position = dodge, alpha = 0.2) + 
    geom_errorbar(position = dodge, width = 0.2) + 
    scale_x_discrete(labels = scales::wrap_format(10)) + # 長いラベルを行単位に分割する
    scale_y_continuous(labels = scales::percent) + 
    scale_color_brewer(type = "qual", palette = "Dark2") + 
    scale_fill_brewer(type = "qual", palette = "Dark2") + 
    labs(x = NULL, y = "Percent",
         title = "Educational Attainment by Race",
         fill = "Race", color = "Race") + 
    facet_wrap(~ year, ncol = 2) + 
    theme(legend.position = "top")

# NAが残ったり,グラフの挙動なんかおかしい?

各年内の内訳を見るには簡単. しかしながら年の経過で比較するのは非常に難しい

次に,教育歴をfacetし,x軸を年にする

data <- out_grp %>% 
  subset(race %nin% "Other") %>% 
  subset(degree %nin% NA)
head(data)
## # A tibble: 6 x 5
## # Groups:   year, race [2]
##    year race  degree           prop prop_se
##   <dbl> <fct> <fct>           <dbl>   <dbl>
## 1  1976 White Lt High School 0.327  0.0160 
## 2  1976 White High School    0.517  0.0161 
## 3  1976 White Junior College 0.0128 0.00298
## 4  1976 White Bachelor       0.101  0.00955
## 5  1976 White Graduate       0.0392 0.00642
## 6  1976 Black Lt High School 0.558  0.0603
head(out_grp)
## # A tibble: 6 x 5
## # Groups:   year, race [1]
##    year race  degree            prop prop_se
##   <dbl> <fct> <fct>            <dbl>   <dbl>
## 1  1976 White Lt High School 0.327   0.0160 
## 2  1976 White High School    0.517   0.0161 
## 3  1976 White Junior College 0.0128  0.00298
## 4  1976 White Bachelor       0.101   0.00955
## 5  1976 White Graduate       0.0392  0.00642
## 6  1976 White <NA>           0.00285 0.00151
p <- ggplot(data = data,
            mapping = aes(x = year, y = prop,
                          ymin = prop - 2*prop_se,
                          ymax = prop + 2*prop_se,
                          color = race,
                          fill = race, 
                          group = race))

p + geom_ribbon(alpha = 0.3, aes(color = NULL)) + 
    geom_line() + 
    facet_wrap(~ degree, ncol = 1) + 
    scale_y_continuous(labels = scales::percent) + 
    labs(x = NULL, y = "Percent",
         title = "Educational Attainment \nby Race",
         subtitle = "GSS 1976-2016",
         fill = "Race", color = "Race") + 
    theme(legend.position = "top")

# graduateがの1976年のデータがない?

6.9 次の一手

モデルを推定し,その結果を可視化する時に難しいのは,モデルから正しく数値を計算・抽出すること
そのためにもモデルの理解や関数の中身を理解する必要がある

6.9.1 基本機能によるモデルの可視化

glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghan…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia…
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997…
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, …
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134…
out <- lm(formula = lifeExp ~ log(gdpPercap) + pop + continent,
          data = gapminder)
summary(out)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap) + pop + continent, data = gapminder)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.7090  -3.4832   0.4396   4.4062  18.6914 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.476e+00  1.352e+00   1.091    0.275    
## log(gdpPercap)    6.524e+00  1.824e-01  35.778  < 2e-16 ***
## pop               9.990e-09  1.650e-09   6.056 1.72e-09 ***
## continentAmericas 6.729e+00  5.507e-01  12.218  < 2e-16 ***
## continentAsia     5.157e+00  4.880e-01  10.567  < 2e-16 ***
## continentEurope   9.290e+00  5.997e-01  15.491  < 2e-16 ***
## continentOceania  8.965e+00  1.521e+00   5.896 4.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.965 on 1697 degrees of freedom
## Multiple R-squared:  0.7102, Adjusted R-squared:  0.7092 
## F-statistic: 693.3 on 6 and 1697 DF,  p-value: < 2.2e-16
plot(out, which = c(1, 2), ask = FALSE) # wichiで最初の1, 2のグラフを出力すると指定した

coefplotパッケージを使う

library(coefplot)
out <- lm(formula = lifeExp ~ log(gdpPercap) + log(pop) + continent,
          data = gapminder)
coefplot(out, sort = "magnitude", intercept = FALSE)

6.9.2 開発中のパッケージ

inferパッケージ - 開発の初期段階だが,すでに有用なものがある

6.9.3 ggplotの拡張に関するパッケージ

GGallyパッケージ - 複雑な図の作成を簡略化するためのパッケージ - あくまでも研究者がデータセットの内容を迅速に確認するため - さらなる調査に向けて探索的にデータを読み解いていくのが目的

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
organdata_sm <- organdata %>% 
  select(donors, pop_dens, pubhealth, 
         roads, consent_law)
?ggpairs
# upperやlowerは
ggpairs(data = organdata_sm,
        mapping = aes(color = consent_law),
        upper = list(continuous = wrap("density"), combo = "box_no_facet"),
        lower = list(continuous = wrap("points"), combo = wrap("dot_no_facet")))
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Removed 37 rows containing non-finite values (stat_density2d).
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Removed 17 rows containing non-finite values (stat_density2d).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 37 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing non-finite values (stat_density).
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Removed 21 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).

ggpairs(data = organdata_sm,
        mapping = aes(color = consent_law),
        upper = list(continuous = wrap("density", combo = "box_no_facet")))
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 37 rows containing non-finite values (stat_density2d).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 17 rows containing non-finite values (stat_density2d).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 37 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing non-finite values (stat_density).
## Warning: Ignoring unknown parameters: combo
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Removed 21 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 17 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 17 rows containing non-finite values (stat_bin).